# This script prepares the data for the DSL analyses using the LLM annotations.
# It produces two .rds files, one for Tunisia (turessdata_final.rds) and one
# for Egypt (masressdata_final.rds).
library(dplyr)
library(quanteda)
library(conText)
library(text2vec)
library(tsibble)
library(lubridate)
source("utils.R")

# Tunisia: build the DSL input table ----
# Stacks the full data set (tagged sample = "cosine", GPT scores set to NA)
# with the GPT-annotated sample (tagged sample = "gpt"), keeps the analysis
# columns, and writes the result to turessdata_final.rds.

# GPT-annotated sample: drop columns 1, 12 and 13 by position, then tag it.
# NOTE(review): presumably these are columns the full table does not have,
# so that rbind() below sees matching column sets - confirm against the CSV.
data_merged_gpt <- read.csv(
  "data/output/cos_sims_dsl/turessdata_sample_gpt.csv"
)[, -c(1, 12, 13)]
data_merged_gpt[["sample"]] <- "gpt"

# Full data set: drop column 11 by position, then add the sample tag and
# NA placeholders for the two GPT score columns.
data_merged <- readRDS(
  "data/output/cos_sims_dsl/turessdata_full.rds"
)[, -11]
data_merged <- mutate(
  data_merged,
  sample = "cosine",
  scorefac_gpt = NA,
  scoreint_gpt = NA
)

# Stack both samples, keep the analysis columns in a fixed order, and drop
# rows whose scoreint_gpt equals 12; rows with a missing score are retained.
# NOTE(review): 12 looks like a sentinel/invalid annotation code - confirm.
data_final <- rbind(data_merged, data_merged_gpt) %>%
  select(
    ID, content, date, yearmon, yearwk, year, newspaper,
    cos_sim2, scoreint_gpt, scorefac_gpt, sample
  ) %>%
  filter(!(!is.na(scoreint_gpt) & scoreint_gpt == 12))

saveRDS(data_final, "data/output/cos_sims_dsl/turessdata_final.rds")

# Egypt: build the DSL input table ----
# Stacks the full data set (tagged sample = "cosine", GPT scores set to NA)
# with the GPT-annotated sample (tagged sample = "gpt"), keeps the analysis
# columns, and writes the result to masressdata_final.rds.

# GPT-annotated sample: drop columns 1, 2, 13, 14 and 15 by position, then
# tag it.
# NOTE(review): presumably these are columns the full table does not have,
# so that rbind() below sees matching column sets - confirm against the CSV.
data_merged_gpt <- read.csv(
  "data/output/cos_sims_dsl/masressdata_sample_gpt.csv"
)[, -c(1, 2, 13, 14, 15)]
data_merged_gpt[["sample"]] <- "gpt"

# Full data set: drop column 11 by position, then add the sample tag and
# NA placeholders for the two GPT score columns.
data_merged <- readRDS(
  "data/output/cos_sims_dsl/masressdata_full.rds"
)[, -11]
data_merged <- mutate(
  data_merged,
  sample = "cosine",
  scorefac_gpt = NA,
  scoreint_gpt = NA
)

# Stack both samples, keep the analysis columns in a fixed order, and drop
# rows whose scoreint_gpt equals 12; rows with a missing score are retained.
# NOTE(review): 12 looks like a sentinel/invalid annotation code - confirm.
data_final <- rbind(data_merged, data_merged_gpt) %>%
  select(
    ID, content, date, yearmon, yearwk, year, newspaper,
    cos_sim2, scoreint_gpt, scorefac_gpt, sample
  ) %>%
  filter(!(!is.na(scoreint_gpt) & scoreint_gpt == 12))

saveRDS(data_final, "data/output/cos_sims_dsl/masressdata_final.rds")